{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Processing QM9 dataset (134 k molecules) \n",
    "This notebook shows how to import the [QM9 dataset](https://figshare.com/articles/Data_for_6095_constitutional_isomers_of_C7H10O2/1057646) and generate featurizations using the MML toolkit, saving the feature vectors in .csv format. \n",
    "\n",
    "The filemformat from the paper\n",
    "Raghunathan Ramakrishnan, Pavlo O. Dral, Matthias Rupp & O. Anatole von Lilienfeld. \"[Quantum chemistry structures and properties of 134 kilo molecules](https://www.nature.com/articles/sdata201422) \" *Scientific Data* **1**, 140022 (2014)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create file list "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "133885\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import os \n",
    "\n",
    "base_dir = '/home/delton/Downloads/134k_molecules_QM9/'\n",
    "file_list = os.listdir(base_dir)\n",
    "num_mols = len(file_list)\n",
    "print(num_mols)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create function to read the authors \".xyz\" format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_xyz(file_name):\n",
    "    with open(file_name, 'rb') as file:\n",
    "        num_atoms = int(file.readline())\n",
    "        properties = file.readline().split()[1:17]\n",
    "        properties = [num.replace(b'*^', b'e') for num in properties] \n",
    "        properties = [float(prop) for prop in properties]\n",
    "        atom_types = [0]*num_atoms\n",
    "        coords = np.array(np.zeros([num_atoms,3]))\n",
    "        for na in range(num_atoms):\n",
    "            coord_line = file.readline().split()\n",
    "            atom_types[na] = coord_line[0]\n",
    "            xyz_coords = coord_line[1:4]\n",
    "            xyz_coords = [num.replace(b'*^', b'e') for num in xyz_coords] \n",
    "            coords[na,:] = [float(num) for num in xyz_coords]  \n",
    "        vib_freqs = file.readline()\n",
    "        smiles = file.readline().split()[0]\n",
    "        inchis = file.readline()\n",
    "        \n",
    "    return smiles, properties, atom_types, coords\n",
    "        \n",
    "    \n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read in data to lists"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "size of largest molecule =  29\n"
     ]
    }
   ],
   "source": [
    "smiles = [0]*num_mols\n",
    "properties = [0]*num_mols \n",
    "atom_types = [0]*num_mols\n",
    "coords = [0]*num_mols\n",
    "\n",
    "for im in range(num_mols):\n",
    "    smiles[im], properties[im], atom_types[im], coords[im] = read_xyz(base_dir+file_list[im])\n",
    "\n",
    "biggest_mol_size = max([len(atom_list) for atom_list in atom_types])\n",
    "\n",
    "print(\"size of largest molecule = \", biggest_mol_size)    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Look at atom types"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{b'O', b'N', b'H', b'F', b'C'}\n"
     ]
    }
   ],
   "source": [
    "all_atoms = []\n",
    "for atoms in atom_types:\n",
    "    all_atoms += atoms\n",
    "print(set(all_atoms))\n",
    "del all_atoms"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## import mmltookit Coulomb matrix function and test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 1.77473464e+02,  5.95943471e+01,  4.63087225e+01,  2.65512413e+01,\n",
       "        2.29384657e+01,  8.57204543e+00,  7.22889442e+00,  2.50963775e+00,\n",
       "       -1.10897971e+00, -7.92663621e-01, -5.70970218e-01, -4.21445985e-01,\n",
       "       -2.83968522e-01, -6.65690196e-02, -6.55728683e-02,  0.00000000e+00,\n",
       "        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,\n",
       "        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,\n",
       "        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,\n",
       "        0.00000000e+00])"
      ]
     },
     "execution_count": 115,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from mmltoolkit.featurizations import coulombmat_eigenvalues_from_coords\n",
    "\n",
    "coulombmat_eigenvalues_from_coords(atom_types[1], coords[1], biggest_mol_size)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## generate Coulomb matrix eigenvalues"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {},
   "outputs": [],
   "source": [
    "cmat_eigs = np.zeros([num_mols, biggest_mol_size])\n",
    "\n",
    "for im in range(num_mols):\n",
    "    cmat_eigs[im,:] = coulombmat_eigenvalues_from_coords(atom_types[im], coords[im], biggest_mol_size)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(133885, 29)"
      ]
     },
     "execution_count": 133,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cmat_eigs.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## A single txt file that contains all the smiles strings.  \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('QM9_all_smiles.txt', 'wb') as file:\n",
    "    for smile in smiles:\n",
    "        file.write(smile+b'\\n')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## An excel file that contains the eignevalues of all Coulomb matrices.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.savetxt('QM9_CM_eigenvalues.csv', cmat_eigs , delimiter=',')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## An excel file that contains all the eigenvalues of the weight matrices. (bond connections etc)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 182,
   "metadata": {},
   "outputs": [],
   "source": [
    "from rdkit.Chem.rdmolops import GetAdjacencyMatrix \n",
    "\n",
    "#----------------------------------------------------------------------------\n",
    "def adjacency_matrix_eigenvalues(mol_list, useBO=False):\n",
    "\n",
    "    eigenvalue_list = []\n",
    "    max_length = 0\n",
    "\n",
    "    for mol in mol_list:\n",
    "        adj_matrix = GetAdjacencyMatrix(mol, useBO=useBO)\n",
    "        evs = np.linalg.eigvals(adj_matrix)\n",
    "        evs = sorted(evs, reverse=True) #sort\n",
    "        evs = [float(ev) for ev in evs]\n",
    "        eigenvalue_list += [evs]\n",
    "        length = len(evs)\n",
    "        if (length > max_length):\n",
    "            max_length = length\n",
    "\n",
    "    print(max_length)\n",
    "    \n",
    "    #zero padding\n",
    "    for i in range(len(eigenvalue_list)):\n",
    "        pad_width = max_length - len(eigenvalue_list[i])\n",
    "        eigenvalue_list[i] += [0]*pad_width\n",
    "\n",
    "    return np.array(eigenvalue_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 190,
   "metadata": {},
   "outputs": [],
   "source": [
    "#from mmltoolkit.featurizations import adjacency_matrix_eigenvalues\n",
    "from rdkit import Chem\n",
    "\n",
    "mol_list = [Chem.AddHs(Chem.MolFromSmiles(smile)) for smile in smiles]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 191,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/delton/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:13: ComplexWarning: Casting complex values to real discards the imaginary part\n",
      "  del sys.path[0]\n"
     ]
    }
   ],
   "source": [
    "adjmat_eigs = adjacency_matrix_eigenvalues(mol_list, useBO=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 192,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(133885, 29)"
      ]
     },
     "execution_count": 192,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adjmat_eigs.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 194,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.savetxt('QM9_WAM_eigenvalues.csv', adjmat_eigs , delimiter=',')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## An excel file that contains the sum over bonds. \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 195,
   "metadata": {},
   "outputs": [],
   "source": [
    "from mmltoolkit.featurizations import sum_over_bonds\n",
    "\n",
    "bond_types, X_LBoB  = sum_over_bonds(mol_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 196,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "C-C,C-O,C-H,H-O,C:C,C=N,C:N,H-N,C-N,N:N,C=O,C#N,C#C,C=C,N:O,C:O,C-F,N-O,N=O,(133885, 19)\n"
     ]
    }
   ],
   "source": [
    "for bond_type in bond_types:\n",
    "    print(bond_type+',', end='')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 198,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.savetxt('QM9_SoB.csv', X_LBoB.astype('int') , fmt='%i', delimiter=',')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "## An excel file that contains all the properties in the right order."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(133885, 16)"
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "properties = np.array(properties)\n",
    "properties.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.savetxt('QM9_properties.csv', properties , delimiter=',')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
